library(tidyverse)
library(stringr)
library(caret)
library(plotly)
library(ggthemes)
library(GGally)
library(class)
library(e1071)
library(stringr)
library(maps)
In this EDA(Exploratory Data Analysis) we will conduct an analysis about the beers sold by Budweiser across different states in the United States. In addition, we will also show the different beer’s styles managed by each brewery in each state which will give us a glimpse about the assortment breath of every Beer store
The whole idea of this analysis is to focus on the different business areas that the Budweiser corporation could improve in terms of customer service and stock availability
Image 1: Logo Budweiser
#Cleaning ABV using mean
df_beers_cl0 = df_beers
nr_mean_abv = mean(df_beers_cl0[!is.na(df_beers_cl0$ABV),]$ABV) #Calculate the mean
length_abv = length(df_beers_cl0[is.na(df_beers_cl0$ABV),]$ABV)
if(length_abv > 0){
df_beers_cl0[is.na(df_beers_cl0$ABV),]$ABV = nr_mean_abv #Replacing Missing value with mean
}
#Cleaning IBU using mean
df_beers_cl1 = df_beers
nr_mean_ibu = mean(df_beers_cl1[!is.na(df_beers_cl1$IBU),]$IBU)
length_ibu = length(df_beers_cl1[is.na(df_beers_cl1$IBU),]$IBU)
if(length_ibu > 0){
df_beers_cl1[is.na(df_beers_cl1$IBU),]$IBU = nr_mean_ibu
}
#Cleaning using KnnInpute
##Creating the model K = 20
knn_imp_model <- preProcess(df_beers_cl0 %>%
select(ABV,IBU),
method = c("knnImpute"),
k = 20,
knnSummary = mean)
#Running the model
df_beers_unp <- predict(knn_imp_model, df_beers_cl0,na.action = na.pass)
#The beer data set will be normalized. To de-normalize and get the original data back:
procNames <- data.frame(col = names(knn_imp_model$mean), mean = knn_imp_model$mean, sd = knn_imp_model$std)
for(i in procNames$col){
df_beers_unp[i] <- df_beers_unp[i]*knn_imp_model$std[i]+knn_imp_model$mean[i]
}
nr_rows = dim(df_beers)[1]
df_beers %>% ggplot(aes(x=IBU))+geom_histogram(fill="black",binwidth = 3)+
labs(title="IBU before performing data cleansing",x="IBU(International bitterness Unit)",y="Observation number")
## Warning: Removed 1005 rows containing non-finite values (`stat_bin()`).
df_beers %>% ggplot(aes(x=ABV))+geom_histogram(fill="black")+
labs(title="ABV before performing data cleansing",x="ABV(Alcohol by Volume)",y="Observation number")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 62 rows containing non-finite values (`stat_bin()`).
#Plotting IBU using Mean
df_beers_cl1 %>% ggplot(aes(x=IBU))+geom_histogram(fill="blue")+
labs(title="IBU after performing data cleansing",subtitle = "Calculation using Mean",x="IBU(International bitterness Unit)",y="Observation number")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Plotting IBU KNNImputation and ABV after data cleansing
df_beers_unp %>% ggplot(aes(x=IBU))+geom_histogram(fill="blue",col="black",binwidth = 3)+labs(title="IBU after performing data cleansing",subtitle = "Calculation using KnnImputation",x="IBU(International bitterness Unit)",y="Observation number")
df_beers_unp %>% ggplot(aes(x=ABV))+geom_histogram(fill="blue",col="black")+labs(title="ABV after performing data cleansing",x="ABV(Alcohol by Volume)",y="Observation number")+geom_density(lwd = 1.2,
linetype = 2,
colour = 2)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
skewness(df_beers_unp$ABV)
## [1] 0.9698248
kurtosis(df_beers_unp$ABV)
## [1] 1.245737
summary(df_beers_unp$ABV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00100 0.05000 0.05700 0.05977 0.06700 0.12800
cor(df_beers_unp$ABV,df_beers_unp$IBU)
## [1] 0.7390397
df_summary = df_breweries_2 %>% group_by(State,Name_State) %>% summarize(NumberBreweries = n())
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
knitr::kable(
df_summary,
caption = "Number of Beers by State"
)
| State | Name_State | NumberBreweries |
|---|---|---|
| AK | Alaska | 7 |
| AL | Alabama | 3 |
| AR | Arkansas | 2 |
| AZ | Arizona | 11 |
| CA | California | 39 |
| CO | Colorado | 47 |
| CT | Connecticut | 8 |
| DC | District of Columbia | 1 |
| DE | Delaware | 2 |
| FL | Florida | 15 |
| GA | Georgia | 7 |
| HI | Hawaii | 4 |
| IA | Iowa | 5 |
| ID | Idaho | 5 |
| IL | Illinois | 18 |
| IN | Indiana | 22 |
| KS | Kansas | 3 |
| KY | Kentucky | 4 |
| LA | Louisiana | 5 |
| MA | Massachusetts | 23 |
| MD | Maryland | 7 |
| ME | Maine | 9 |
| MI | Michigan | 32 |
| MN | Minnesota | 12 |
| MO | Missouri | 9 |
| MS | Mississippi | 2 |
| MT | Montana | 9 |
| NC | North Carolina | 19 |
| ND | North Dakota | 1 |
| NE | Nebraska | 5 |
| NH | New Hampshire | 3 |
| NJ | New Jersey | 3 |
| NM | New Mexico | 4 |
| NV | Nevada | 2 |
| NY | New York | 16 |
| OH | Ohio | 15 |
| OK | Oklahoma | 6 |
| OR | Oregon | 29 |
| PA | Pennsylvania | 25 |
| RI | Rhode Island | 5 |
| SC | South Carolina | 4 |
| SD | South Dakota | 1 |
| TN | Tennessee | 3 |
| TX | Texas | 28 |
| UT | Utah | 4 |
| VA | Virginia | 16 |
| VT | Vermont | 10 |
| WA | Washington | 23 |
| WI | Wisconsin | 20 |
| WV | West Virginia | 1 |
| WY | Wyoming | 4 |
df_beerbre_unp = merge(df_beers_unp,df_breweries_2,by.x = "Brewery_id",by.y = "Brew_ID")
df_acom_bebrew_1 = df_beerbre_unp %>% group_by(State,Name_State) %>% summarize(Median_ABV = mean(ABV),Median_IBU = mean(IBU))
# df_acom_bebrew_1 %>% ggplot(aes(x=State,color=State))+geom_bar()+labs(title = "Alcohol by Volume",subtitle = "Alcohol by Volume average by State")+coord_flip()
df_sort_1 = arrange(df_beerbre_unp,desc(ABV)) %>% head(n = 1)
sprintf("The state that has the maximum ABV is %s-%f",df_sort_1$Name_State,df_sort_1$ABV)
## [1] "The state that has the maximum ABV is Colorado-0.128000"
df_beerbre_unp %>% select(IBU,ABV) %>% ggpairs(columnLabels = c("ABV","IBU"))+
labs(title="Relationship between the bitterness of the beer(IBU) and its alcoholic content(ABV)")
df_beerbre_fil0 = filter(df_beerbre_unp,str_detect(df_beerbre_unp$Style,regex("IPA|ALE",ignore_case = TRUE)))
#We create the column Type for classifying IPA Beer Styles and the Rest
df_beerbre_fil1 = mutate(df_beerbre_fil0,Type= ifelse(str_detect(Style,regex("IPA",ignore_case = TRUE)),"IPA","ALE"))
#We plot the relationship between IBU and ABV by Beer Type
df_beerbre_fil1 %>% ggplot(aes(x=ABV,y=IBU,color=Type))+geom_point()+
labs(title = "Relationship IBU and AVB",subtitle = "Relationship IBU/AVB by Beer Type")+xlab("Alcohol by Volume (ABV)")+ylab("International Bitterness Unit(IBU)")
#Initializing the model
nr_percentage = 0.7
nr_observations = nrow(df_beerbre_fil1)
nr_k = 5
#Creating the training and Testing dataset
lst_index_beer = sample(nr_observations,round(nr_observations * nr_percentage))
df_train = df_beerbre_fil1[lst_index_beer,]
df_test = df_beerbre_fil1[-lst_index_beer,]
#Running the model
knn_result = knn(df_train[,c(4,5)],df_test[,c(4,5)],df_train$Type,prob = TRUE,k=nr_k)
co_table = table(knn_result,df_test$Type)
confusionMatrix(co_table)
## Confusion Matrix and Statistics
##
##
## knn_result ALE IPA
## ALE 261 35
## IPA 42 135
##
## Accuracy : 0.8372
## 95% CI : (0.8008, 0.8693)
## No Information Rate : 0.6406
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6496
##
## Mcnemar's Test P-Value : 0.4941
##
## Sensitivity : 0.8614
## Specificity : 0.7941
## Pos Pred Value : 0.8818
## Neg Pred Value : 0.7627
## Prevalence : 0.6406
## Detection Rate : 0.5518
## Detection Prevalence : 0.6258
## Balanced Accuracy : 0.8278
##
## 'Positive' Class : ALE
##
#We accomulate the number of beers by style in each state
df_sum_1 = group_by(df_beerbre_unp,State,Name_State,Style) %>% summarize(count_style = n())
## `summarise()` has grouped output by 'State', 'Name_State'. You can override
## using the `.groups` argument.
df_sum_1 = mutate(df_sum_1,flavor_type = ifelse(str_detect(Style,regex("IPA",ignore_case = TRUE)),"Ipa","Pale"))
df_sum_1 = mutate(df_sum_1,general_stype = ifelse(str_detect(Style,regex("lager",ignore_case = TRUE)),"Lager","Ale"))
#We accomulate the number of beers by style but only the most popular(Beer styles with more than five beers)
df_sum_2 = group_by(df_beerbre_unp,State,Name_State,Style) %>% summarize(count_style = n()) %>% filter(count_style>5) %>% mutate(Name_State_2=str_trim(str_to_lower(Name_State)))
## `summarise()` has grouped output by 'State', 'Name_State'. You can override
## using the `.groups` argument.
df_sum_3 = group_by(df_beerbre_unp,State,Name_State) %>% summarize(count_style = n()) %>% mutate(Name_State_2=str_trim(str_to_lower(Name_State)))
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
#We accomulate by Style and in addition create a column with the attribute Lager
df_sum_4 = group_by(df_beerbre_unp,State,Name_State,Style) %>% summarize(count_style = n()) %>% mutate(Name_State_2=str_trim(str_to_lower(Name_State)),Lager=ifelse(str_detect(Style,regex("lager",ignore_case = TRUE)),"X","")) %>%
filter(count_style>5)
## `summarise()` has grouped output by 'State', 'Name_State'. You can override
## using the `.groups` argument.
#Plotting the Popular Beer Style by State x= Style, Y = Count of beer , color = Name of the State
gfr = df_sum_2 %>% arrange(Name_State) %>% ggplot(aes(x=Style,y=count_style,fill = Name_State ))+geom_bar(stat = "identity",position = "stack")+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),legend.position = "left",text = element_text(size = 10))+labs(title = "Most popular beer style in USA by State",subtitle = "Style with a very few beers are not shown in this analysis")+xlab("USA State")+ylab("Number of Beers by Style")
ggplotly(gfr)
gfr = df_sum_4 %>% arrange(Name_State) %>% ggplot(aes(x=Name_State,y=count_style,fill = Lager))+geom_bar(stat = "identity",position = "stack")+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),legend.position = "left",text = element_text(size = 10))
#Dataframe Map by State
df_usa_states_1 = map_data("state")
df_usa_states_2 = map_data("state")
#We calculate the mean of every state in order to locate the Label in the center
df_usa_states_3 = df_usa_states_1 %>% group_by(region) %>% summarize(lat=mean(lat),long=mean(long)) %>% mutate(AB_State = str_to_upper(str_sub(region,1,2)))
#Merge of the beer data and the coordinates and locations
df_state_beer_1 = merge(df_sum_2,df_usa_states_1,by.x = "Name_State_2",by.y = "region")
df_state_beer_2 = merge(df_usa_states_1,df_sum_3,by.x="region",by.y = "Name_State_2")
df_state_beer_3 = merge(df_usa_states_3,df_sum_3,by.x="region",by.y = "Name_State_2")
#Plotting the MAP and The label of each state
map = df_usa_states_1 %>% ggplot()+geom_polygon(aes(x=long, y=lat, group = group,fill=region))+
theme(legend.position = "none")+ggrepel::geom_label_repel(aes(x=long, y=lat,label=sprintf("%s,Beers:%i",Name_State,count_style),alpha=0.8),size=2,data=df_state_beer_3)+
labs(title = "Number of Beer Types by States in USA",subtitle = "Assorment of Beer Types by State in the USA")+xlab("Latitude")+ylab("Longitude")
map
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Comment on the summary statistics and distribution of the ABV variable
Accordint to the histogram we can notice that the data seems normally distributed